Raw data

Dataset downloaded from mgandal’s github repository.

Load and annotate data

# Load csvs
datExpr = read.csv('./../Data/Gandal/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../Data/Gandal/RNAseq_ASD_datMeta.csv')

# Group brain regions by lobes
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))

# Remove '/' from Batch variable: (It is recommended (but not required) to use only letters, numbers, 
# and delimiters '_' or '.', in levels of factors as these are safe characters for column names in R
datMeta$Batch = gsub('/', '.', datMeta$RNAExtractionBatch) %>% as.factor

# Transform Diagnosis into a factor variable
datMeta$Diagnosis_ = factor(datMeta$Diagnosis_, levels=c('CTL','ASD'))

Check sample composition

Data description taken from the dataset’s synapse entry: RNAseq data was generated from 88 postmortem cortex brain samples from subjects with ASD (53 samples from 24 subjects) and non-psychiatric controls (35 samples from 17 subjects), across four cortical regions encompassing all major cortical lobes – frontal, temporal, parietal, and occipital. Brain samples were obtained from the Harvard Brain Bank as part of the Autism Tissue Project (ATP).

print(paste0('Dataset includes ', nrow(datExpr), ' genes from ', ncol(datExpr), ' samples belonging to ', length(unique(datMeta$Subject_ID)), ' different subjects.'))
## [1] "Dataset includes 63682 genes from 88 samples belonging to 41 different subjects."


Diagnosis distribution: There are more ASD samples than controls

table(datMeta$Diagnosis_)
## 
## CTL ASD 
##  35  53


Brain region distribution: All regions seem to be balanced

table(datMeta$Brain_lobe)
## 
##   Frontal  Temporal  Parietal Occipital 
##        22        20        23        23


Diagnosis and brain region seem to be balanced except for the frontal lobe, where there are more control samples than ASD ones.

table(datMeta$Diagnosis_, datMeta$Brain_lobe)
##      
##       Frontal Temporal Parietal Occipital
##   CTL      13        6        8         8
##   ASD       9       14       15        15


Sex distribution: There are many more Male samples than Female ones

table(datMeta$Sex)
## 
##  F  M 
## 15 73


Age distribution: Subjects between 5 and 60 years old with a mean close to 30

summary(datMeta$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2.00   17.00   28.00   29.74   41.75   60.00


Annotate genes with BioMart information


Filtering


1. Filter genes with start or end position missing

to_keep = !is.na(datGenes$length)
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]
rownames(datGenes) = datGenes$ensembl_gene_id

print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 5, 63677 remaining"


2. Filter genes with low expression levels

\(\qquad\) 2.1 Remove genes with zero expression in all of the samples

to_keep = rowSums(datExpr)>0
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 13720, 49957 remaining"

\(\qquad\) 2.2 Removing genes with a mean expression lower than 0.5

plot_data = data.frame('id'=rownames(datExpr), 'mean_expression' = rowMeans(datExpr))
ggplotly(plot_data %>% ggplot(aes(x=mean_expression)) + geom_density(color='#0099cc', fill='#0099cc', alpha=0.3) + 
              geom_vline(xintercept=0.5, color='gray') + scale_x_log10() + 
              ggtitle('gene Mean Expression distribution') + theme_minimal())
to_keep = rowMeans(datExpr)>0.5
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 16764, 33193 remaining"


3. Filter outlier samples

\(\qquad\) 3.1 Gandal filters samples belonging to subject AN03345 without giving an explanation. Since it could have some technical problems, I remove them as well

to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 2, 86 remaining"

\(\qquad\) 3.2 Filter out outliers: Using node connectivity as a distance measure, normalising it and filtering out genes farther away than 2 standard deviations from the left (lower connectivity than average, not higher)

absadj = datExpr %>% bicor %>% abs
netsummary = fundamentalNetworkConcepts(absadj)
ku = netsummary$Connectivity
z.ku = (ku-mean(ku))/sqrt(var(ku))

plot_data = data.frame('sample'=1:length(z.ku), 'distance'=z.ku, 'Sample_ID'=datMeta$Sample_ID, 
                       'Subject_ID'=datMeta$Subject_ID, 'Extraction_Batch'=datMeta$RNAExtractionBatch,
                       'Brain_Lobe'=datMeta$Brain_lobe, 'Sex'=datMeta$Sex, 'Age'=datMeta$Age,
                       'Diagnosis'=datMeta$Diagnosis_, 'PMI'=datMeta$PMI)
selectable_scatter_plot(plot_data, plot_data[,-c(1,2)])
print(paste0('Outlier samples: ', paste(as.character(plot_data$Sample_ID[plot_data$distance< -2]), collapse=', ')))
## [1] "Outlier samples: AN01971_BA38, AN17254_BA17, AN09714_BA38, AN01093_BA7, AN02987_BA17, AN11796_BA7"
to_keep = abs(z.ku)<2
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]

print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 6, 80 remaining"
rm(absadj, netsummary, ku, z.ku, plot_data, to_keep)
print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 33193 genes and 80 samples"




Batch Effects

According to Tackling the widespread and critical impact of batch effects in high-throughput data, technical artifacts can be an important source of variability in the data, so batch correction should be part of the standard preprocessing pipeline of gene expression data.

They say Processing group and Date of the experiment are good batch surrogates, so I’m going to see if they affect the data in any clear way to use them as surrogates.

Processing group

All the information we have is the Brain Bank, and although all the samples were obtained from the Autism Tissue Project, we don’t have any more specific information about who preprocessed each sample

table(datMeta$Brain_Bank)
## 
## ATP 
##  80


Date of processing

There are two different dates when the data was procesed

table(datMeta$RNAExtractionBatch)
## 
## 10/10/2014  6/20/2014 
##         53         27

Luckily, there doesn’t seem to be a correlation between the batch surrogate and the objective variable, so the batch effect will not get confused with the Diagnosis effect

table(datMeta$RNAExtractionBatch, datMeta$Diagnosis_)
##             
##              CTL ASD
##   10/10/2014  24  29
##   6/20/2014   11  16

*All the samples from each subject were processed on the same day (makes sense, otherwise they wound need to freeze the samples)

Samples don’t seem to cluster together that strongly for each batch, although there does seem to be some kind of relation

h_clusts = datExpr %>% t %>% dist %>% hclust %>% as.dendrogram

create_viridis_dict = function(){
  min_age = datMeta$Age %>% min
  max_age = datMeta$Age %>% max
  viridis_age_cols = viridis(max_age - min_age + 1)
  names(viridis_age_cols) = seq(min_age, max_age)
  
  return(viridis_age_cols)
}
viridis_age_cols = create_viridis_dict()

dend_meta = datMeta[match(substring(labels(h_clusts),2), datMeta$Dissected_Sample_ID),] %>% 
            mutate('Batch' = ifelse(RNAExtractionBatch=='10/10/2014', '#F8766D', '#00BFC4'),
                   'Diagnosis' = ifelse(Diagnosis_=='CTL','#008080','#86b300'), # Blue control, Green ASD
                   'Sex' = ifelse(Sex=='F','#ff6666','#008ae6'),                # Pink Female, Blue Male
                   'Region' = case_when(Brain_lobe=='Frontal'~'#F8766D',        # ggplot defaults for 4 colours
                                        Brain_lobe=='Temporal'~'#7CAE00',
                                        Brain_lobe=='Parietal'~'#00BFC4',
                                        Brain_lobe=='Occipital'~'#C77CFF'),
                   'Age' = viridis_age_cols[as.character(Age)]) %>%             # Purple: young, Yellow: old
            dplyr::select(Age, Region, Sex, Diagnosis, Batch)
h_clusts %>% set('labels', rep('', nrow(datMeta))) %>% set('branches_k_color', k=9) %>% plot
colored_bars(colors=dend_meta)

rm(h_clusts, dend_meta, create_viridis_dict, viridis_age_cols)

There seems to be a different behaviour by batch mainly in the first and third principal components

pca = datExpr %>% t %>% prcomp
summary(pca)$importance[,1:3]
##                                 PC1          PC2         PC3
## Standard deviation     273215.31037 171051.12817 98195.28432
## Proportion of Variance      0.57415      0.22504     0.07416
## Cumulative Proportion       0.57415      0.79920     0.87336
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3]) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            mutate('Batch'=RNAExtractionBatch) %>% dplyr::select('PC1','PC2','PC3','Batch')

plot_data %>% ggpairs(progress=FALSE, aes(colour=Batch, fill=Batch, alpha=0.3)) + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

rm(pca, plot_data)

Comparing the mean expression of each sample by batch we can see the batch effect is reflected here as well

plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 'Batch'='6/20/2014')

plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))

ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         ggtitle('Mean expression by sample grouped by Batch') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data, mu)


Looking for unknown sources of batch effects

Following the pipeline from Surrogate variable analysis: hidden batch effects where sva is used with DESeq2.

Create a DeseqDataSet object, estimate the library size correction and save the normalized counts matrix

counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
                  IRanges(datGenes$start_position, width=datGenes$length),
                  strand=datGenes$strand,
                  feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design = ~ Diagnosis_)

dds = estimateSizeFactors(dds)
norm.cts = counts(dds, normalized=TRUE)

Provide the normalized counts and two model matrices to SVA. The first matrix uses the biological condition, and the second model matrix is the null model.

mod = model.matrix(~ Diagnosis_, colData(dds))
mod0 = model.matrix(~ 1, colData(dds))
sva_fit = svaseq(norm.cts, mod=mod, mod0=mod0)
## Number of significant surrogate variables is:  14 
## Iteration (out of 5 ):1  2  3  4  5
rm(mod, mod0)

Found 14 surrogate variables, which seems like a lot, but since there is no direct way to select which ones to pick Bioconductor answer, kept all of them.

Include SV estimations to datMeta information

sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV',1:ncol(sv_data))

datMeta_sva = cbind(datMeta, sv_data)

rm(sv_data)

In conclusion: Date of extraction works as a surrogate for batch effect and the sva package found other 14 variables that could work as surrogates which are now included in datMeta and should be included in the DEA.


Normalisation and Differential Expression Analysis

Using DESeq2 package to perform normalisation. Chose this package over limma because limma uses the log transformed data as input instead of the raw counts and I have discovered that in this dataset, this transformation affects genes differently depending on their mean expression level, and genes with a high SFARI score are specially affected by this.

plot_data = data.frame('ID'=rownames(datExpr), 'Mean'=rowMeans(datExpr), 'SD'=apply(datExpr,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.1) + geom_abline(color='gray') +
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)
counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
                  IRanges(datGenes$start_position, width=datGenes$length),
                  strand=datGenes$strand,
                  feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_sva)
dds = DESeqDataSet(se, design = ~ Batch + SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 + 
                                  SV10 + SV11 + SV12 + SV13 + SV14 + Diagnosis_)

# Perform DEA
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
## 18 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
DE_info = results(dds, lfcThreshold=0, altHypothesis='greaterAbs')

# Perform vst
vsd = vst(dds)

datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)

rm(counts, rowRanges, se, dds, vsd)

Using the plotting function DESEq2’s manual proposes to study vst’s output it looks like the data could be homoscedastic

meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()

When plotting point by point it seems like the genes with the lowest values behave differently

plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.05) + 
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)




Adjust filtering

Based on the last plot, I’m increasing the filtering threshold from mean expression > 0.5 to mean expression > 1 to remove the weird behaviour the lowest expressed genes create in the normalised data

Filtering genes with mean expression lower than 1

to_keep = rowMeans(datExpr)>1
datGenes = datGenes[to_keep,]
datExpr = datExpr[to_keep,]

print(paste0('Removed ', sum(!to_keep), ', ', sum(to_keep), ' remaining'))
## [1] "Removed 3081, 30112 remaining"
rm(to_keep)

Save filtered and annotated dataset

save(datExpr, datMeta, datGenes, file='./../Data/Gandal/filtered_raw_data.RData')
#load('./../Data/Gandal/filtered_raw_data.RData')

Repeat SVA analysis

counts = datExpr %>% as.matrix
rowRanges = GRanges(datGenes$chromosome_name,
                  IRanges(datGenes$start_position, width=datGenes$length),
                  strand=datGenes$strand,
                  feature_id=datGenes$ensembl_gene_id)
se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta)
dds = DESeqDataSet(se, design = ~ Diagnosis_)

dds = estimateSizeFactors(dds)
norm.cts = counts(dds, normalized=TRUE)

mod = model.matrix(~ Diagnosis_, colData(dds))
mod0 = model.matrix(~ 1, colData(dds))
sva_fit = svaseq(norm.cts, mod=mod, mod0=mod0)
## Number of significant surrogate variables is:  13 
## Iteration (out of 5 ):1  2  3  4  5
rm(se, dds, norm.cts, mod, mod0)

Include SV estimations to datMeta information

sv_data = sva_fit$sv %>% data.frame
colnames(sv_data) = paste0('SV',1:ncol(sv_data))

datMeta_sva = cbind(datMeta, sv_data)

rm(sv_data)




And repeat Normalisation

*Four genes did not converge, so increased the maxit and one got fixed but didn’t get the other three to converge as well, so instead I removed them

# Defined counts and rowRanges in the SVA analysis

se = SummarizedExperiment(assays=SimpleList(counts=counts), rowRanges=rowRanges, colData=datMeta_sva)
dds = DESeqDataSet(se, design = ~ Batch + SV1 + SV2 + SV3 + SV4 + SV5 + SV6 + SV7 + SV8 + SV9 + 
                                  SV10 + SV11 + SV12 + SV13 + Diagnosis_)

# Perform DEA
# dds = DESeq(dds) # Had to change DESeq for its components because nbinomWaldTest was not converging
dds = estimateSizeFactors(dds)
dds = estimateDispersions(dds)
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
dds = nbinomWaldTest(dds, maxit=300)
## 5 rows did not converge in beta, labelled in mcols(object)$betaConv. Use larger maxit argument with nbinomWaldTest
# Filter three genes that did not converge
print(paste0('genes that did not converge: ', paste(rownames(datExpr)[!mcols(dds)$betaConv], collapse=', ')))
## [1] "genes that did not converge: ENSG00000171956, ENSG00000205663, ENSG00000237622, ENSG00000242384, ENSG00000261018"
dds = dds[mcols(dds)$betaConv,]

# Extract results from the analysis
DE_info = results(dds, lfcThreshold=0, altHypothesis='greaterAbs')

# Perform vst
vsd = vst(dds)

datExpr_vst = assay(vsd)
datMeta_vst = colData(vsd)
datGenes_vst = rowRanges(vsd)

rm(counts, rowRanges, se, vsd)

This plot remains stable

meanSdPlot(datExpr_vst, plot=FALSE)$gg + theme_minimal()

This one looks better now. The valley found in the original density plot of the mean expression of the genes seems to still be present here beecause there seem to be two clouds main of points

plot_data = data.frame('ID'=rownames(datExpr_vst), 'Mean'=rowMeans(datExpr_vst), 'SD'=apply(datExpr_vst,1,sd))

plot_data %>% ggplot(aes(Mean, SD)) + geom_point(color='#0099cc', alpha=0.05) + 
              scale_x_log10() + scale_y_log10() + theme_minimal()

rm(plot_data)
datExpr = datExpr_vst
datMeta = datMeta_vst %>% data.frame
datGenes = datGenes_vst

print(paste0('After filtering, the dataset consists of ', nrow(datExpr), ' genes and ', ncol(datExpr), ' samples'))
## [1] "After filtering, the dataset consists of 30107 genes and 80 samples"
rm(datExpr_vst, datMeta_vst, datGenes_vst)




Batch Effect Correction

By including the surrogate variables in the DESeq formula we only modelled the batch effects into the DEA, but we didn’t actually correct them from the data, for that we need to use ComBat (or other equivalent package) in the already normalised data

SVA surrogate variables

In some places they say you shouldn’t correct these effects on the data because you risk losing biological variation, in others they say you should because they introduce noise to the data.

Comparing data with and without surrogate variable correction

# Taken from https://www.biostars.org/p/121489/#121500
correctDatExpr = function(datExpr, mod, svs) {
  X = cbind(mod, svs)
  Hat = solve(t(X) %*% X) %*% t(X)
  beta = (Hat %*% t(datExpr))
  rm(Hat)
  gc()
  P = ncol(mod)
  return(datExpr - t(as.matrix(X[,-c(1:P)]) %*% beta[-c(1:P),]))
}

pca_samples_before = datExpr %>% t %>% prcomp
pca_genes_before = datExpr %>% prcomp

# Correct
mod = model.matrix(~ Diagnosis_, colData(dds))
svs = datMeta %>% dplyr::select(SV1:SV13) %>% as.matrix
datExpr_corrected = correctDatExpr(as.matrix(datExpr), mod, svs)

pca_samples_after = datExpr_corrected %>% t %>% prcomp
pca_genes_after = datExpr_corrected %>% prcomp

Samples

Now I understand why they say you lose the biological signal …

pca_samples_df = rbind(data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_before$x[,1],
                                  'PC2'=pca_samples_before$x[,2], 'corrected'=0),
                       data.frame('ID'=colnames(datExpr), 'PC1'=pca_samples_after$x[,1],
                                  'PC2'=pca_samples_after$x[,2], 'corrected'=1)) %>%
                 left_join(datMeta %>% mutate('ID'=rownames(datMeta)), by='ID')

ggplotly(pca_samples_df %>% ggplot(aes(PC1, PC2, color=Diagnosis_)) + geom_point(aes(frame=corrected, id=ID)) + 
         xlab(paste0('PC1 (corr=', round(cor(pca_samples_before$x[,1],pca_samples_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_samples_before$x[,2],pca_samples_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_samples_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_samples_after)$importance[2,2],1))) +
         ggtitle('Samples') + theme_minimal())
rm(pca_samples_df)


Genes

It seems like the sv correction preserves the mean expression of the genes and erases almost everything else

*Plot is done with only 10% of the genes because it was too heavy

pca_genes_df = rbind(data.frame('ID'=rownames(datExpr), 'PC1'=pca_genes_before$x[,1],
                                'PC2'=pca_genes_before$x[,2], 'corrected'=0, 'MeanExpr'=rowMeans(datExpr)),
                     data.frame('ID'=rownames(datExpr), 'PC1'=-pca_genes_after$x[,1],
                                'PC2'=pca_genes_after$x[,2], 'corrected'=1, 'MeanExpr'=rowMeans(datExpr)))

keep_genes = rownames(datExpr) %>% sample(0.1*nrow(datExpr))

pca_genes_df = pca_genes_df %>% filter(ID %in% keep_genes)

ggplotly(pca_genes_df %>% ggplot(aes(PC1, PC2,color=MeanExpr)) + geom_point(alpha=0.3, aes(frame=corrected, id=ID)) +
         xlab(paste0('PC1 (corr=', round(cor(pca_genes_before$x[,1],pca_genes_after$x[,1]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,1],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,1],1))) +
         ylab(paste0('PC2 (corr=', round(cor(pca_genes_before$x[,2],pca_genes_after$x[,2]),2),
                     '). % Var explained: ', round(100*summary(pca_genes_before)$importance[2,2],1),' to ',
                     round(100*summary(pca_genes_after)$importance[2,2],1))) +
         scale_color_viridis() + ggtitle('Genes') + theme_minimal())
rm(pca_samples_before, pca_genes_before, mod, svs, pca_samples_after, pca_genes_after, pca_samples_df, keep_genes)

Conclusion: Don’t correct the dataset for the surrogate variables obtained from sva, they do include biological signals!

Processing date

After normalisation there seems to still be a difference in the behaviour in the first PC to the processing date, but the other PC don’t seem to have a relation any more

pca = datExpr %>% t %>% prcomp
summary(pca)$importance[,1:3]
##                             PC1      PC2      PC3
## Standard deviation     36.86612 30.24649 21.41412
## Proportion of Variance  0.20336  0.13689  0.06861
## Cumulative Proportion   0.20336  0.34025  0.40887
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3]) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            mutate('Batch'=RNAExtractionBatch) %>% dplyr::select('PC1','PC2','PC3','Batch')

plot_data %>% ggpairs(progress=FALSE, aes(colour=Batch, fill=Batch, alpha=0.3)) + theme_minimal()

rm(pca, plot_data)

It seems like the Batch effect related to the preprocessing date might have flipped during the normalisation

plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 'Batch'='6/20/2014')

plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))

ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         ggtitle('Mean expression by sample grouped by processing date') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data)


Performing Batch Correction for processing date

https://support.bioconductor.org/p/50983/

datExpr = datExpr %>% as.matrix %>% ComBat(batch=datMeta$Batch)
## Found2batches
## Adjusting for0covariate(s) or covariate level(s)
## Standardizing Data across genes
## Fitting L/S model and finding priors
## Finding parametric adjustments
## Adjusting the Data

Plots grouping by processing date changed although I’m not sure if it’s for better or for worse?

pca = datExpr %>% t %>% prcomp
summary(pca)$importance[,1:3]
##                             PC1      PC2      PC3
## Standard deviation     35.82860 29.66247 21.37263
## Proportion of Variance  0.19971  0.13688  0.07106
## Cumulative Proportion   0.19971  0.33659  0.40766
plot_data = data.frame('ID'=colnames(datExpr), 'PC1' = pca$x[,1], 'PC2' = pca$x[,2], 'PC3' = pca$x[,3]) %>% 
            mutate('ID'=substring(ID,2)) %>% left_join(datMeta, by=c('ID'='Dissected_Sample_ID')) %>% 
            mutate('Batch'=RNAExtractionBatch) %>% dplyr::select('PC1','PC2','PC3','Batch')

plot_data %>% ggpairs(progress=FALSE, aes(colour=Batch, fill=Batch, alpha=0.3)) + theme_minimal()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

rm(pca, plot_data)

Both batches now have almost the same mean expression

plot_data_b1 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='10/10/2014']), 'Batch'='10/10/2014')
plot_data_b2 = data.frame('Mean'=colMeans(datExpr[,datMeta$RNAExtractionBatch=='6/20/2014']), 'Batch'='6/20/2014')

plot_data = rbind(plot_data_b1, plot_data_b2)
mu = plot_data %>% group_by(Batch) %>% dplyr::summarise(BatchMean=mean(Mean))

ggplotly(plot_data %>% ggplot(aes(x=Mean, color=Batch, fill=Batch)) + geom_density(alpha=0.3) + 
         geom_vline(data=mu, aes(xintercept=BatchMean, color=Batch), linetype='dashed') +
         ggtitle('Mean expression by sample grouped by processing date') + scale_x_log10() + theme_minimal())
rm(plot_data_b1, plot_data_b2, plot_data)



Save preprocessed dataset

save(datExpr, datMeta, datGenes, DE_info, dds, file='./../Data/Gandal/preprocessed_data.RData')


Load SFARI Genes and annotate them with ensembl IDs

# Load SFARI information
SFARI_genes = read_csv('./../Data/SFARI/SFARI_genes_01-15-2019.csv')

# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
               dataset='hsapiens_gene_ensembl',
               host='feb2014.archive.ensembl.org') ## Gencode v19

gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'), 
                   values=SFARI_genes$`gene-symbol`, mart=mart) %>% 
                   mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>% 
                   dplyr::select('ID', 'gene-symbol')

SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol') %>% distinct(ID, .keep_all=T)

write.csv(SFARI_genes, './../Data/SFARI/SFARI_genes_with_ensembl_IDs.csv', row.names=F)




Session info

sessionInfo()
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Scientific Linux 7.5 (Nitrogen)
## 
## Matrix products: default
## BLAS/LAPACK: /usr/lib64/R/lib/libRblas.so
## 
## locale:
##  [1] LC_CTYPE=en_GB.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_GB.UTF-8        LC_COLLATE=en_GB.UTF-8    
##  [5] LC_MONETARY=en_GB.UTF-8    LC_MESSAGES=en_GB.UTF-8   
##  [7] LC_PAPER=en_GB.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_GB.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] parallel  stats4    stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] hexbin_1.27.2               dendextend_1.10.0          
##  [3] vsn_3.50.0                  WGCNA_1.66                 
##  [5] fastcluster_1.1.25          dynamicTreeCut_1.63-1      
##  [7] sva_3.30.1                  genefilter_1.64.0          
##  [9] mgcv_1.8-24                 nlme_3.1-137               
## [11] DESeq2_1.22.2               SummarizedExperiment_1.12.0
## [13] DelayedArray_0.8.0          BiocParallel_1.16.6        
## [15] matrixStats_0.54.0          Biobase_2.42.0             
## [17] GenomicRanges_1.34.0        GenomeInfoDb_1.18.2        
## [19] IRanges_2.16.0              S4Vectors_0.20.1           
## [21] BiocGenerics_0.28.0         biomaRt_2.38.0             
## [23] ggExtra_0.8                 GGally_1.4.0               
## [25] gridExtra_2.3               viridis_0.5.1              
## [27] viridisLite_0.3.0           RColorBrewer_1.1-2         
## [29] plotlyutils_0.0.0.9000      plotly_4.8.0               
## [31] glue_1.3.1                  reshape2_1.4.3             
## [33] forcats_0.3.0               stringr_1.4.0              
## [35] dplyr_0.8.0.1               purrr_0.3.1                
## [37] readr_1.3.1                 tidyr_0.8.3                
## [39] tibble_2.1.1                ggplot2_3.1.0              
## [41] tidyverse_1.2.1            
## 
## loaded via a namespace (and not attached):
##   [1] readxl_1.1.0           backports_1.1.2        Hmisc_4.1-1           
##   [4] plyr_1.8.4             lazyeval_0.2.2         splines_3.5.1         
##   [7] crosstalk_1.0.0        robust_0.4-18          digest_0.6.18         
##  [10] foreach_1.4.4          htmltools_0.3.6        GO.db_3.7.0           
##  [13] magrittr_1.5           checkmate_1.8.5        memoise_1.1.0         
##  [16] fit.models_0.5-14      cluster_2.0.7-1        doParallel_1.0.14     
##  [19] limma_3.38.3           annotate_1.60.1        modelr_0.1.4          
##  [22] prettyunits_1.0.2      colorspace_1.4-1       blob_1.1.1            
##  [25] rvest_0.3.2            rrcov_1.4-3            haven_1.1.1           
##  [28] xfun_0.5               crayon_1.3.4           RCurl_1.95-4.10       
##  [31] jsonlite_1.6           impute_1.56.0          survival_2.42-3       
##  [34] iterators_1.0.9        gtable_0.2.0           zlibbioc_1.28.0       
##  [37] XVector_0.22.0         kernlab_0.9-27         prabclus_2.2-7        
##  [40] DEoptimR_1.0-8         scales_1.0.0           mvtnorm_1.0-7         
##  [43] DBI_1.0.0              miniUI_0.1.1.1         Rcpp_1.0.1            
##  [46] xtable_1.8-3           progress_1.2.0         htmlTable_1.11.2      
##  [49] mclust_5.4             foreign_0.8-70         bit_1.1-14            
##  [52] preprocessCore_1.44.0  Formula_1.2-3          htmlwidgets_1.3       
##  [55] httr_1.4.0             fpc_2.1-11.1           modeltools_0.2-22     
##  [58] acepack_1.4.1          flexmix_2.3-15         pkgconfig_2.0.2       
##  [61] reshape_0.8.7          XML_3.98-1.11          nnet_7.3-12           
##  [64] locfit_1.5-9.1         labeling_0.3           tidyselect_0.2.5      
##  [67] rlang_0.3.2            later_0.8.0            AnnotationDbi_1.44.0  
##  [70] munsell_0.5.0          cellranger_1.1.0       tools_3.5.1           
##  [73] cli_1.1.0              generics_0.0.2         RSQLite_2.1.1         
##  [76] broom_0.5.1            evaluate_0.13          yaml_2.2.0            
##  [79] knitr_1.22             bit64_0.9-7            robustbase_0.93-0     
##  [82] whisker_0.3-2          mime_0.6               xml2_1.2.0            
##  [85] compiler_3.5.1         rstudioapi_0.7         curl_3.3              
##  [88] affyio_1.52.0          geneplotter_1.60.0     pcaPP_1.9-73          
##  [91] stringi_1.4.3          trimcluster_0.1-2.1    lattice_0.20-35       
##  [94] Matrix_1.2-14          pillar_1.3.1           BiocManager_1.30.4    
##  [97] data.table_1.12.0      bitops_1.0-6           httpuv_1.5.0          
## [100] affy_1.60.0            R6_2.4.0               latticeExtra_0.6-28   
## [103] promises_1.0.1         codetools_0.2-15       MASS_7.3-50           
## [106] assertthat_0.2.1       withr_2.1.2            GenomeInfoDbData_1.2.0
## [109] diptest_0.75-7         hms_0.4.2              grid_3.5.1            
## [112] rpart_4.1-13           class_7.3-14           rmarkdown_1.12        
## [115] Cairo_1.5-9            shiny_1.2.0            lubridate_1.7.4       
## [118] base64enc_0.1-3